Program X

String Manipulation

  • It can be helpful to transform strings
    • Sometimes your data is text-based
    • Text also appears in tables and figures
    • So we may want to customize it quickly
  • Tidyverse contains tools for this

String Manipulation Live Coding

# SETUP: We will need tidyverse for almost all of these functions

library(tidyverse)

# ==============================================================================

# USECASE: Change the capitalization of a string

x <- "R4SS: Introduction to R for Social Scientists"

x |> str_to_lower() # lower case

x |> str_to_upper() # upper case

x |> str_to_sentence() # sentence case

x |> str_to_title() # title case

# ==============================================================================

# USECASE: Pull out part of each string by character position

x <- c("Apple", "Banana", "Pear")

str_sub(x, 1, 3) # positions 1 through 3

str_sub(x, -3, -1) # negative positions are counted from the end

str_sub(x, 2, -2) # second character through the second-to-last

str_sub(x, 1, 5) # the end position can go beyond the end of a string

# ==============================================================================

# USECASE: Clean up extra white space in strings

x <- "  Sometimes strings have   too   much white space "
print(x)

# remove white space at the start and end only
x |> str_trim()

# trim the ends and also collapse the inner white space
x |> str_squish()

# ==============================================================================

# USECASE: Delete or swap out patterns inside strings

x <- "Scientists very often utilize very fancy words,
      even when they could utilize simpler ones."

# str_remove() drops the first pattern match only
x |> str_remove("very ")

# str_remove_all() drops every pattern match
x |> str_remove_all("very ")

# the replace functions come in the same two flavors
x |> str_replace("utilize", "use")

x |> str_replace_all("utilize", "use")

# NOTE: Regular expressions (regex) let you search for more complex patterns

# ==============================================================================

# USECASE: Chain several string manipulations into one pipeline

x_clean <-
  x |>
  str_remove_all(pattern = "very ") |>
  str_replace_all(pattern = "utilize", replacement = "use")

print(x_clean)

::: footer
Extra Slides
:::

If Else

  • A locked door behaves conditionally
    • If you have the key, then open up…
    • Otherwise, stay closed…
  • Sometimes we want code to behave conditionally
    • Filter retains observations conditionally
      (e.g., if it meets a condition, it gets to stay)
    • Let’s learn to transform variables conditionally
    • We can use if_else() for simple examples

If Else Live Coding

# SETUP: We will need tidyverse for almost all of these functions

library(tidyverse)

# ==============================================================================

# USECASE: Label someone as an adult or a child (e.g., for voting in the US)

age <- 12

age_group <- if_else(
  condition = age >= 18,
  true = "adult",
  false = "child"
)
print(age_group)

# ==============================================================================

# TIP: The argument names are optional, so the same call can be written shorter

age_group <- if_else(age >= 18, "adult", "child")
print(age_group)

# ==============================================================================

# LESSON: if_else() really shines when it is given a whole vector at once

ages <- c(13, 18, 14, 19, 22, 16)

age_groups <- if_else(ages >= 18, "adult", "child")
print(age_groups)

# ==============================================================================

# USECASE: Because it works on vectors, if_else() fits inside mutate()

# read the example data, treating the string "-999" as missing (NA)
cereal <- read_csv(file = "cereal.csv", na = "-999")

# flag cereals with a rating above 50
cereal2 <-
  cereal |>
  mutate(popular = if_else(rating > 50, "yes", "no")) |>
  print()

# label cereals by whether they contain any sugar
cereal3 <- mutate(
  cereal,
  diabetes = if_else(
    condition = sugars == 0,
    true = "sugar-free",
    false = "contains sugar"
  )
)
cereal3

Case When

  • An elevator also behaves conditionally
    • If you press a button, then it goes to that floor
    • There are usually more than just two buttons
    • In this analogy (but not in real life), the elevator only responds to the first button pressed
  • Sometimes we want code to behave this way
    • case_when() expands upon if_else()
    • It can have multiple conditions (floor buttons)
    • The first condition met “wins” (picks the floor)

Case When Live Coding

# SETUP: We will need tidyverse for almost all of these functions

library(tidyverse)

# ==============================================================================

# USECASE: Work out which movie ratings each of your kids may watch

ages <- c(11, 13, 18)

# conditions are checked from top to bottom; the first one met wins
movies_allowed <- case_when(
  ages >= 17 ~ "R",
  ages >= 13 ~ "PG-13",
  ages < 13 ~ "PG"
)
print(movies_allowed)

# ==============================================================================

# PITFALL: Don't put the least restrictive condition first

age <- 18

# age >= 13 is met first, so PG-13 wins before age >= 17 is ever checked
movies_allowed2 <- case_when(
  age < 13 ~ "PG",
  age >= 13 ~ "PG-13",
  age >= 17 ~ "R"
)
print(movies_allowed2)

# ==============================================================================

# USECASE: Re-code a variable with case_when() while wrangling data

print(starwars)

# collapse species into three groups and keep only name and the new variable
sw <-
  starwars |>
  mutate(
    species3 = case_when(
      species == "Human" ~ "Human",
      species == "Droid" ~ "Droid",
      species != "Human" & species != "Droid" ~ "Alien"
    )
  ) |>
  select(name, species3)

sw

# ==============================================================================

# TIP: case_when() has a .default argument for "none of the conditions were met"
# This is where the elevator will drop you off if you hit no buttons
# NOTE(review): .default was added in dplyr 1.1.0, so the earlier "next version"
# wording looks stale -- confirm the dplyr version installed for the class

sw <-
  starwars |>
  mutate(
    species3 = case_when(
      species == "Human" ~ "Human",
      species == "Droid" ~ "Droid",
      .default = "Alien"
    )
  ) |>
  select(name, species3) |>
  print()

# NOTE: The above code needs a dplyr version that supports .default
# On an older dplyr, use TRUE ~ "Alien" as the last condition instead
# (that works, but it is harder to explain)

Wrangle X

Across

  • We can use across() to repeat an operation across multiple variables in a tibble
    • This makes our code shorter
    • It is faster to read and write
    • It is also less error-prone
  • So we can repeat a function in order to…
    • mutate() multiple variables
    • summarize() multiple variables

Across Live Coding

# SETUP: We will need tidyverse and an example dataset

library(tidyverse)

print(starwars)

# ==============================================================================

# USECASE: Repeating one mutation for several variables is tedious

# before: the same factor() call written out once per variable
sw <- mutate(
  starwars,
  hair_color = factor(hair_color),
  skin_color = factor(skin_color),
  eye_color = factor(eye_color)
)
print(sw)

# after: across() applies factor() to every column listed in .cols
sw <- mutate(
  starwars,
  across(.cols = c(hair_color, skin_color, eye_color), .fns = factor)
)
print(sw)

# ==============================================================================

# PITFALL: Don't forget to wrap the .cols part in c()
# The call below is intentionally wrong: without c(), only `mass` is given to
# .cols, and `birth_year` is passed to across() as a separate, unnamed argument

sw <- 
  starwars |> 
  mutate(
    across(
      .cols = mass, birth_year, 
      .fns = round,
      digits = 1
    )
  ) |> 
  print() # error

# ==============================================================================

# LESSON: To pass arguments to the inner function, wrap it in an anonymous
# function: \(x) round(x, digits = 1)
# NOTE: Adding the arguments directly inside across(), e.g.
# across(.cols, .fns = round, digits = 1), relies on across()'s `...`, which is
# deprecated as of dplyr 1.1.0 and triggers a warning

sw <-
  starwars |>
  mutate(
    across(
      .cols = c(mass, birth_year),
      .fns = \(x) round(x, digits = 1) # x stands for each selected column
    )
  ) |>
  print()

# ==============================================================================

# USECASE: You can also apply the same summary function across variables

# before: the same mean() call written out once per variable
sw <-
  starwars |>
  summarize(
    height = mean(height, na.rm = TRUE),
    mass = mean(mass, na.rm = TRUE),
    birth_year = mean(birth_year, na.rm = TRUE)
  ) |>
  print()

# after: across() applies the function to every column listed in .cols
# na.rm = TRUE goes inside an anonymous function because passing it through
# across()'s `...` is deprecated as of dplyr 1.1.0
sw <-
  starwars |>
  summarize(
    across(
      .cols = c(height, mass, birth_year),
      .fns = \(x) mean(x, na.rm = TRUE)
    )
  ) |>
  print()

Separate and Unite

  • Tidy data needs one value per cell
  • So we may need to separate cells
    • e.g., What was the model of my first car?
    • "Nissan Altima 2003"
    • "Nissan" "Altima" "2003"
  • But some tasks require us to unite cells
    • e.g., What address should I mail to?
    • 123 "Main Street"
    • "123 Main Street"

Separate Live Coding

# SETUP: We will need tidyverse and an example dataset

library(tidyverse)

# Build a small example tibble, written row by row
dat <- tribble(
  ~id,        ~duration,
  "A_001_01", "01:16",
  "A_002_01", "01:21",
  "B_001_01", "01:49",
  "B_002_01", "00:34",
  "C_001_01", "00:32",
  "C_002_01", "00:54"
)
print(dat)

# ==============================================================================

# USECASE: Split one column into several columns

# duration is split at the colon into min and sec
dat2 <- separate(dat, col = duration, into = c("min", "sec"), sep = ":")
dat2

# ==============================================================================

# USECASE: "into" can name more than two new columns

# id is split at each underscore into group, subject, and time
dat2 <- separate(
  dat,
  col = id,
  into = c("group", "subject", "time"),
  sep = "_"
)
dat2

# ==============================================================================

# TIP: Add convert = TRUE to automatically convert strings into numbers

dat2 <- separate(
  dat,
  col = id,
  into = c("group", "subject", "time"),
  sep = "_",
  convert = TRUE
)
dat2

# ==============================================================================

# PITFALL: Don't forget to close the "into" vector's parentheses
# The call below is intentionally wrong: c() is closed too late, so sep = ":"
# ends up as a third element of the "into" vector instead of being passed to
# separate() as its own sep argument

dat2 <- 
  dat|> 
  separate(col = duration, into = c("min", "sec", sep = ":")) #error

Unite Live Coding

# SETUP: We will need tidyverse and an example dataset

library(tidyverse)

# Rebuild the separated version of `dat` (the example tibble created in the
# Separate live coding) so there are columns to unite
# The ids look like "A_001_01", so the id column must be split on "_"
dat2 <-
  dat |>
  separate(col = id, into = c("group", "subject", "time"), sep = "_") |>
  separate(col = duration, into = c("min", "sec"), sep = ":") |>
  print()

# ==============================================================================

# USECASE: Paste several columns together into a single string column

# group, subject, and time are joined with "-"; min and sec are joined with ":"
dat3 <- unite(dat2, col = "newid", group, subject, time, sep = "-")
dat3 <- unite(dat3, col = "duration", min, sec, sep = ":")
print(dat3)

# ==============================================================================

# LESSON: Use remove = FALSE to keep the columns that were united

dat3 <- unite(dat2, col = "newid", group:time, sep = "", remove = FALSE)
print(dat3)

Visualize X

Describe this Graphic 1

Data

  • starwars {tidyverse}

Aesthetics/Scales

  • height to X (continuous)
  • mass to Y (continuous)

Geoms

  • Point (dots)
  • Smooth (local)

Describe this Graphic 2

Data

  • mpg {tidyverse}

Aesthetics/Scales

  • displ to X (continuous)
  • hwy to Y (continuous)
  • drv to color (discrete)

Geoms

  • Point (dots)
  • Smooth (linear)

Describe this Graphic 3

Data

  • mpg {tidyverse}

Aesthetics/Scales

  • hwy to X (continuous)
  • class to Y (discrete)

Geoms

  • Boxplot (fill = lightblue)
  • VLine (xintercept = 20)

Describe this Graphic 4

Data

  • flights {nycflights13}

Aesthetics/Scales

  • origin to X (discrete)
  • origin to color (discrete)
  • count to Y (stat from geom)

Geoms

  • Bar (fill = white)